1 Introduction

Loading Data

The following libraries will be used.

library(data.table) # database-like tables
library(ggplot2) # general plots
library(highcharter) # interactive plots
library(DT) # display tables
library(corrplot) # corr SPLOMs
library(vcd) # stats for categories
library(mice) # for imputation

Input is loaded and covnerted to data.table.

test <- read.csv("../input/test.csv")
train <- read.csv("../input/train.csv")
test <- data.table(test)
train <- data.table(train)

2 Data Quality

The number of dimensions and sampling density is calculated. The datasets are combined.

(N <- nrow(train))
## [1] 1460
(p <- ncol(train) - 2)
## [1] 79
N^(1/p)
## [1] 1.096617
test$SalePrice <- NaN
full <- rbind(train, test)

2.1 Feature Attributes

From the data_description variable types can be defined. There are 4 categories: nominal, ordinal, discrete, continuous. A lot of the discrete variables could arguably be handled as ordinals.

nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
             "Neighborhood", "Condition1", "Condition2", "HouseStyle",
             "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
             "MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
             "GarageType", "PavedDrive", "MiscFeature", "SaleType",
             "SaleCondition")
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
             "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
             "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
             "HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
             "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
             "MoSold")
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
              "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
              "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
              "YrSold")
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
              "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
              "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
              "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
              "MiscVal", "LotFrontage")

A table is created to store feature attributes.

featAttr <- data.table(
  name = colnames(test)[-1],
  type = "nominal"
)
featAttr[name %in% ordVars, type := "ordinal"]
featAttr[name %in% discVars, type := "discrete"]
featAttr[name %in% contVars, type := "continuous"]

Special Values

According to data_description several \(NA\) have a special meaning. These values are replaced with more descriptive chr strings. Before, test and train set are combined.

full$label <- rep(c("train", "test"), c(nrow(test), nrow(train)))
full[is.na(Alley), Alley := "noAccess"]
full[is.na(BsmtQual), BsmtQual := "noBasement"]
full[is.na(BsmtCond), BsmtCond := "noBasement"]
full[is.na(BsmtExposure), BsmtExposure := "noBasement"]
full[is.na(BsmtFinType1), BsmtFinType1 := "noBasement"]
full[is.na(BsmtFinType2), BsmtFinType2 := "noBasement"]
full[is.na(FireplaceQu), FireplaceQu := "noFireplace"]
full[is.na(GarageType), GarageType := "noGarage"]
full[is.na(GarageFinish), GarageFinish := "noGarage"]
full[is.na(GarageQual), GarageQual := "noGarage"]
full[is.na(GarageCond), GarageCond := "noGarage"]
full[is.na(PoolQC) , PoolQC := "noPool"]
full[is.na(Fence), Fence := "noFence"]
full[is.na(MiscFeature), MiscFeature := "none"]

Nominal, ordinal, and discrete variables are converted to factor, continuous to numeric.

for (j in c(nomVars, ordVars, discVars)) full[[j]] <- as.factor(full[[j]])
for (j in contVars) full[[j]] <- as.numeric(full[[j]])

Ordering

From data_description the order of ordinal variable classes can be derived. These orderings are defined, then the levels of their factors are adjusted.

lvlOrd <- list(
  LotShape = c("Reg", "IR1", "IR2", "IR3"),
  Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
  LandSlope = c("Gtl", "Mod", "Sev"),
  BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
  OverallQual = 10:1,
  OverallCond = 10:1,
  ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
  ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
  BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
  BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
  BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
  BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
  BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
  HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
  KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
  Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
  FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
  GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
  GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
  GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
  PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
  Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
  MoSold = 1:12
)
for (j in names(lvlOrd)) {
  full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}

2.2 Feature Dependencies

Some features’ values heavily depend on the value of another feature. For example the values of \(BsmtUnfSF\), \(TotalBsmtSF\) only make sense if features \(BsmtCond\), \(BsmtExposure\), \(BsmtQual\) are not \(noBasement\). This information is given in data_description.

\[ \begin{aligned} BsmtUnfSF, TotalBsmtSF : BsmtCond, BsmtExposure, BsmtQual \ne noBasement\\ BsmtFinSF1 : BsmtFinType1 \ne noBasement\\ BsmtFinSF2 : BsmtFinType2 \ne noBasement \\ GarageYrBlt, GarageCars, GarageArea: GarageType, GarageFinish, GarageQual, GarageCond \ne noGarage \\ PoolArea : PoolQC \ne noPool \\ MiscVal : MiscFeature \ne none \\ MasVnrArea : MasVnrType \ne none \end{aligned} \] For the garage and basement features there are several features which indicate the absence of that feature. If one of them indicates the absence of a feature, the others should too. This can be tested with the difference between two sets.

outersect <- function(x, y) sort(c(setdiff(x, y), setdiff(y, x)))

The differences between Ids of observations with noBasement are computed.

outersect(which(full$BsmtCond == "noBasement"),
          which(full$BsmtExposure == "noBasement"))
## [1]  949 1488 2041 2186 2349 2525
outersect(which(full$BsmtCond == "noBasement"),
          which(full$BsmtQual == "noBasement"))
## [1] 2041 2186 2218 2219 2525
full[c(949, 1488, 2041, 2186, 2218, 2219, 2349, 2525), 
     .(Id, BsmtCond, BsmtExposure, BsmtQual, BsmtUnfSF, TotalBsmtSF)]
##      Id   BsmtCond BsmtExposure   BsmtQual BsmtUnfSF TotalBsmtSF
## 1:  949         TA   noBasement         Gd       936         936
## 2: 1488         TA   noBasement         Gd      1595        1595
## 3: 2041 noBasement           Mn         Gd         0        1426
## 4: 2186 noBasement           No         TA        94        1127
## 5: 2218         Fa           No noBasement       173         173
## 6: 2219         TA           No noBasement       356         356
## 7: 2349         TA   noBasement         Gd       725         725
## 8: 2525 noBasement           Av         TA       240         995

So here the data_description is not consistent. The general condition is set to noBasement but there are values for basement exposure, quality, and so on. noBasement was translated from NAs. Since for all the other basement related variables there are reasonable values it might be that in these cases NA did not mean noBasement.

The same can be done for the garage related variables.

outersect(which(full$GarageType == "noGarage"),
          which(full$GarageFinish == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
          which(full$GarageQual == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
          which(full$GarageCond == "noGarage"))
## [1] 2127 2577
full[c(2127, 2577), 
     .(GarageYrBlt, GarageCars, GarageArea, GarageType, 
       GarageFinish, GarageQual, GarageCond)]
##    GarageYrBlt GarageCars GarageArea GarageType GarageFinish GarageQual
## 1:          NA          1        360     Detchd     noGarage   noGarage
## 2:          NA         NA         NA     Detchd     noGarage   noGarage
##    GarageCond
## 1:   noGarage
## 2:   noGarage

Here, most garage related features were missing.

Mark Undefined Variables

If e.g. PoolQC is noPool, then there should be a missing vlaue or a zero for PoolArea. If there actually is a value, this could either mean it is a mistake or it means something special that is not clearly described. To at least be consistent, all NAs will be set to 0.

full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := "0"]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := "0"]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
full[MiscFeature == "noFeature" & is.na(MiscVal), MiscVal := 0]

2.3 Missing Data

\(NA\) which are left must be true missing data.

naFeats <- apply(full[, !"SalePrice"], 2, function(x) any(is.na(x)))
naObs <- apply(full[, !"SalePrice"], 1, function(x) any(is.na(x)))

Features

The total number of missing values per feature will be saved. The features with at least one missing value are shown below.

total <- apply(full[, naFeats, with = FALSE], 2, function(j) sum(is.na(j)))
df <- data.frame(
  missing = total / nrow(full),
  name = factor(names(total), levels = names(total)[order(total)]),
  type = featAttr[match(names(total), featAttr$name), type]
)
ggplot(df, aes(x = name, y = missing, fill = type)) +
  geom_bar(stat = "identity") +
  scale_y_continuous("proportion missing") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Features with Missing Values")

Observations

Below the number of missing values for observations that have missing values is shown. Many observations have 1 or 2, some have 3 or 4 missing values.

total <- apply(full[naObs, ], 1, function(i) sum(is.na(i)))
df <- data.frame(
  y = total,
  name = full[naObs, Id]
)
highchart() %>%
    hc_chart(type = "column") %>%
    hc_title(text = "Observations with Missing Values") %>%
    hc_xAxis(title = list(text = "observations"), categories = df$name, 
             labels  = list(rotation = "-45")) %>%
    hc_yAxis(title = list(text = "missing"), align = "left") %>%
    hc_add_series(showInLegend = FALSE, data = list_parse(df[order(df$y), ]))

3 Feature Exploration

3.1 Feature Distributions

Feature distributions are described for each type of variable separately.

3.1.1 Continuous Variables

Values were log10 transformed to better capture their spread. There are many zeros, partly because of the dependency to another variable. These zeros are excluded during the transformation.

dt <- melt(full, "Id", contVars)
ggplot(dt, aes(x = value)) +
  geom_histogram(bins = 30) +
  scale_x_continuous(trans = "log10") +
  facet_wrap(~ variable, scale = "free") +
  theme_bw()

Summaries for each distribution are computed. There are many undefined variables which are set to zero. This influences the description a lot. Therefore, they are excluded.

dt <- full[, contVars, with = FALSE]
df <- data.frame(
  Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
  Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
  Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
  Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)

Relative difference between mean and median (relDelta) are computed as an indicator for skewness. Also, relative cardinality and relative number of outliers are computed. Outliers are defined as being differing at least \(+/- 1.5 MAD\) from the median.

df$relDelta <- abs(df$Mean - df$Median) / df$Mean
df$relCard <- apply(dt, 2, function(x) {
  length(unique(x[x > 0])) / length(x[x > 0])
})
df$relOut <- apply(dt, 2, function(x) {
  length(boxplot.stats(x[x > 0])$out) / length(x[x > 0])
})

PoolArea and MiscVal show a high proportion of outliers, but this might be because there are not many total values for these variables. Many supposably continuous variables have a quite low cardinality.

DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('relDelta', 
    color = styleInterval(c(0.2, 0.8), c('green', 'orange', 'red'))) %>%
  formatStyle('relCard', 
    color = styleInterval(c(0.2, 0.6), c('red', 'orange', 'green'))) %>%
  formatStyle('relOut', 
    color = styleInterval(c(0.05, 0.1), c('green', 'orange', 'red'))) %>%
  formatRound(c(2, 5:7), 3)

3.1.2 Discrete Variables

Distributions are shown below.

dt <- melt(full, "Id", discVars)
ggplot(dt, aes(x = value)) +
  geom_bar() +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()

As summary, the value range, and cardinality are computed.

dt <- full[, discVars, with = FALSE]
df <- data.frame(
  Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
  Min = apply(dt, 2, function(x) min(as.numeric(as.character(x)), na.rm = TRUE)),
  Max = apply(dt, 2, function(x) max(as.numeric(as.character(x)), na.rm = TRUE))
)

Apart from that, the modes are usually interesting. For this a function has to be defined first.

Mode <- function(x, fst = TRUE) {
  ux <- unique(x)
  tab <- tabulate(match(x, ux))
  modeIdx <- which.max(tab)
  if (fst) return(list(value = ux[modeIdx], freq = tab[modeIdx]))
  tab[modeIdx] <- 0
  modeIdx <- which.max(tab)
  return(list(value = ux[modeIdx], freq = tab[modeIdx]))
}

First and second modes are computed and the table is shown. Modes that make up more than 50% of observations will be highlighted.

df$Mode <- apply(dt, 2, function(x) Mode(x)$value)
df$ModeCount <- apply(dt, 2, function(x) Mode(x)$freq)
df$Mode2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$value)
df$ModeCount2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('ModeCount', 
    color = styleInterval(nrow(dt) / 2, c("black", 'orange')))

In GarageYrBlt the maximum value is 2207, which does not make sense. It seems that this is a typo and the real value is 2007. After checking that there are not more values above 2010 2207 is changed to 2007.

full[GarageYrBlt == 2207, GarageYrBlt := "2007"]

3.1.3 Categorical Variables

Ordinal Variables

dt <- melt(full, "Id", ordVars)
ggplot(dt, aes(x = value)) +
  geom_bar() +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()

Nominal Variables

dt <- melt(full, "Id", nomVars)
ggplot(dt, aes(x = value)) +
  geom_bar() +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()

Summary

Cardinality, 1st and 2nd modes are given as summary. Modes that make up more than 50% of observations will be highlighted.

dt <- full[, c(ordVars, nomVars), with = FALSE]
df <- data.frame(
  Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
  Mode = apply(dt, 2, function(x) Mode(x)$value),
  ModeCount = apply(dt, 2, function(x) Mode(x)$freq),
  Mode2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$value),
  ModeCount2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq),
  Type = rep(c("ord", "nom"), c(length(ordVars), length(nomVars)))
)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('Type', 
    color = styleEqual(unique(df$Type), c('orange', 'blue'))) %>% 
  formatStyle('ModeCount', 
    color = styleInterval(nrow(dt) / 2, c("black", 'orange')))

3.2 Correlations between Features

3.2.1 Continuous

vars <- c(contVars, discVars)

The Pearson Correlation Coefficient is calculated for all feature pairs where both features are continuous. The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).

m <- cor(
  full[, contVars, with = FALSE],
  use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")

Some clusters are visible. The same hierarchical clustering is shown as tree below.

corrClust <- hclust(dist(m))
plot(corrClust)

3.2.2 Categorical

Here, discrete variables are included.

vars <- c(discVars, ordVars, nomVars)

For categorical features Cramer’s V is calculated for all feature pairs to indicate correlation. The results are shown as heatmap below.

m <- as.matrix(full[, vars, with = FALSE])
m <- m[complete.cases(m), ]
mCramer <- matrix(NA, ncol(m), ncol(m))
for (i in seq_len(ncol(m))) {
  mCramer[i, ] <- vapply(
    seq_len(ncol(m)), 
    function(j) assocstats(table(m[, i], m[, j]))$cramer, 
    numeric(1)
  )
}

Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).

colnames(mCramer) <- colnames(m)
rownames(mCramer) <- colnames(m)
corrplot(mCramer, method = "color", order = "hclust")

The tree for the clustering:

corrClust <- hclust(dist(mCramer))
plot(corrClust)


4 Imputation

4.1 Single Imputation

Single missing values in basement related features, where all other basement related features did have a value, are replaced by their mode.

full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]

For garage related features there are two cases with almost no non-missing value for garage. For these two cases the garage features are set to noGarage.

full[c(2127, 2577), GarageYrBlt := "0"]
full[c(2127, 2577), GarageCars := "0"]
full[c(2127, 2577), GarageArea := 0]
full[c(2127, 2577), GarageType := "noGarage"]

Features with only very few missing values will be imputed using the mode for categories and the median for continuous variables.

full[is.na(Exterior1st), Exterior1st := "VinylSd"]
full[is.na(Exterior2nd), Exterior2nd := "VinylSd"]
full[is.na(Electrical), Electrical := "SBrkr"]
full[is.na(KitchenQual), KitchenQual := "TA"]
full[is.na(SaleType), SaleType := "WD"]
full[is.na(Utilities), Utilities := "AllPub"]
full[is.na(Functional), Functional := "Typ"]
full[is.na(MSZoning), MSZoning := "RL"]
full[is.na(MasVnrArea), MasVnrArea := 202]
full[is.na(MasVnrType), MasVnrType := "none"]

There were two observations with NA values for basement bath related variables in which all other basement related variables were set to noBasement. These will be set to zreo.

full[is.na(BsmtFullBath), BsmtFullBath := "0"]
full[is.na(BsmtHalfBath), BsmtHalfBath := "0"]

4.2 Multiple Imputation

BldgType and LotFrontage have around 10-15% missing values. These will be imputed by Gibbs sampling of the posterior distribution. Variables with missing values are repeatedly predicted given a set of predictor variables. For LotFrontage predictive mean matching, and for BldgType polytomous regression will be used. As predictors all continuous variables and the the 3 categorical variables which are most correlated to BldgType are included. Several datasets are created by this method but only one will be used.

vars <- c(contVars, "BldgType", "MSSubClass", "BedroomAbvGr", "TotRmsAbvGrd")
dt <- full[, vars, with = FALSE]
imp <- mice(dt, seed = 42)

LotFrontage

The distribution of LotFrontage after imputation is compared to the original distribution.

df <- data.frame(
  LotFrontage = c(dt$LotFrontage, complete(imp)$LotFrontage),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = Distro)) +
  geom_density() +
  theme_bw()

Both distributions look very similar. Furthermore, its correlation with the 4 most correlated continuous variables is compared.

df <- data.frame(
  Value = c(dt$LotArea, dt$MasVnrArea, dt$GrLivArea, dt$GarageArea),
  Variable = rep(c("LotArea", "MasVnrArea", "GrLivArea", "GarageArea"), 
                 each = nrow(dt)),
  LotFrontage = rep(complete(imp)$LotFrontage, each = 4),
  isImputed = factor(is.na(dt$LotFrontage), levels = c(TRUE, FALSE))
)
ggplot(df, aes(x = LotFrontage, y = Value, color = isImputed))  +
  geom_point(size = .7) +
  facet_wrap( ~ Variable, scales = "free") +
  theme_bw()

Imputed values seem to resemble the shape of the original data.

BldgType

Imputed and original distributions are compared.

df <- data.frame(
  BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = BldgType, fill = Distro)) +
  geom_bar(position = "dodge") +
  scale_x_discrete() +
  theme_bw()

Next correlation to the closest correlated categorical feature MSSubClass before and after imputation is checked.

df <- data.frame(
  BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
  MSSubClass = rep(dt$MSSubClass, 2),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = MSSubClass, fill = BldgType)) +
  geom_bar(position = "dodge") +
  facet_grid(Distro ~ .) +
  theme_bw()

Both

Finally, the correlation of both imputed variables is checked.

df <- data.frame(
  LotFrontage = rep(c(dt$LotFrontage, complete(imp)$LotFrontage), each = 2),
  BldgType = rep(c(as.character(dt$BldgType), 
                   as.character(complete(imp)$BldgType)), each = 2),
  Distro = rep(c("Original", "Imputed"), each = 2*nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = BldgType)) +
  geom_density() +
  facet_grid(Distro ~ .) +
  theme_bw()
## Warning: Removed 972 rows containing non-finite values (stat_density).

The distribution of \(BldgType = TwnhsE\) changed visibly. However, with less than 10% this value is quite underrepresented so its distribution estimate is less stable. All in all the imputed values seem to be useful